In [1]:
import sys, os
sys.path.insert(0, os.path.abspath(os.pardir))  # make the project root importable
import numpy as np
from keras.preprocessing import sequence
from sklearn.cross_validation import train_test_split
from src.models.convnets import ConvolutionalNet
from src.preprocessors.preprocess_text import clean, mark_unknown_words
from src.train import words_to_indices, SEQUENCE_LENGTH, EMBEDDING_DIMENSION, MODEL_FILE
In [2]:
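# vocabulary.txt holds one token per line; invert it into a word -> index lookup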
vocabulary = open("../data/vocabulary.txt").read().split("\n")
inverse_vocabulary = dict((word, i) for i, word in enumerate(vocabulary))
clickbait = open("../data/reddit/clickbait-reddit.txt").read() + "\n"
clickbait += open("../data/reddit/clickbait-top-reddit.txt").read()
clickbait = clean(clickbait)
clickbait = clickbait.split("\n")
clickbait = list(set(clickbait))
clickbait = [mark_unknown_words(vocabulary, title) for title in clickbait]
print "Clickbait"
print "-" * 50
for i, each in enumerate(clickbait[:5]):
    print "{0}. {1}".format(i + 1, each)
print "-" * 50
genuine = open("../data/reddit/genuine-reddit.txt").read() + "\n"
genuine += open("../data/reddit/news-reddit.txt").read()
genuine = clean(genuine)
genuine = genuine.split("\n")
genuine = list(set(genuine))
genuine = [mark_unknown_words(vocabulary, title) for title in genuine]
print "Genuine"
print "-" * 50
for i, each in enumerate(genuine[:5]):
    print "{0}. {1}".format(i + 1, each)
print "-" * 50
In [3]:
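# convert every title to a fixed-length vector of vocabulary indices,
# padded/truncated to SEQUENCE_LENGTH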
C = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)
G = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in genuine], maxlen=SEQUENCE_LENGTH)
X = np.concatenate([C, G], axis=0)
y = np.array([[1] * C.shape[0] + [0] * G.shape[0]], dtype=np.int32).T
p = np.random.permutation(y.shape[0])
X = X[p]
y = y[p]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
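A quick, optional sanity check: X should be (num_titles, SEQUENCE_LENGTH) integer indices and y a (num_titles, 1) column with 1 = clickbait and 0 = genuine; stratify=y keeps the class ratio the same in both splits.

print X_train.shape, y_train.shape
print y_train.mean(), y_test.mean()  # the two class ratios should match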
In [4]:
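# these constants come from src/train.py, so the rebuilt architecture
# matches the pretrained weights loaded further down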
params = dict(vocabulary_size=len(vocabulary), embedding_dimension=EMBEDDING_DIMENSION, input_length=SEQUENCE_LENGTH)
model = ConvolutionalNet(**params)
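The actual architecture is defined in src/models/convnets.py and rendered in the next cell. Purely for orientation, a comparable Keras 1.x text CNN looks like the sketch below; the filter count and width are illustrative guesses, not the repo's values.

from keras.models import Sequential
from keras.layers import Embedding, Convolution1D, GlobalMaxPooling1D, Dense

def ToySketchNet(vocabulary_size, embedding_dimension, input_length):
    model = Sequential()
    # map token indices to dense vectors
    model.add(Embedding(vocabulary_size, embedding_dimension, input_length=input_length))
    # slide 3-token filters over each title, keep the strongest response per filter
    model.add(Convolution1D(64, 3, activation="relu"))
    model.add(GlobalMaxPooling1D())
    # one sigmoid unit: the probability that the title is clickbait
    model.add(Dense(1, activation="sigmoid"))
    return model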
In [5]:
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
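# model_to_dot needs pydot and graphviz installed to render the graph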
SVG(model_to_dot(model).create(prog="dot", format="svg"))
Out[5]:
(SVG rendering of the model's layer graph)
In [22]:
from keras.layers import Convolution1D, Dense
from keras.regularizers import activity_l1l2
from keras.optimizers import SGD, Adam
# start from the pretrained detector weights
model.load_weights("../models/detector.h5")
# freeze everything first...
for layer in model.layers:
    layer.trainable = False
# ...then unfreeze just the dense and convolutional layers, swapping their
# weight regularizers for an L1/L2 activity penalty before fine-tuning
for layer in model.layers[1:]:
    if isinstance(layer, (Dense, Convolution1D)):
        layer.W_regularizer = None
        layer.activity_regularizer = activity_l1l2(0.05)
        layer.trainable = True
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])
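Before fitting, it is worth verifying which layers were actually unfrozen; everything except the dense and convolutional layers should report False:

for layer in model.layers:
    print layer.name, layer.trainable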
In [23]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, nb_epoch=5, shuffle=True)
Out[23]:
(per-epoch loss/accuracy log and the returned History object)
In [24]:
model.save_weights("../models/detector.finetuned.h5")
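To use the fine-tuned detector later, rebuild the model with the same params and load the new weights. The snippet below scores one made-up headline through the same preprocessing path as the training data; it assumes clean() also accepts a single title string.

model = ConvolutionalNet(**params)
model.load_weights("../models/detector.finetuned.h5")
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

title = mark_unknown_words(vocabulary, clean("you won't believe what happened next"))
x = sequence.pad_sequences([words_to_indices(inverse_vocabulary, title.split())], maxlen=SEQUENCE_LENGTH)
print model.predict(x)[0, 0]  # close to 1.0 means clickbait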